# Mechanisms_MediaCoverage_Heterogeneity.R
# Goal: Check whether COVID fiscal aid impacted media coverage differently by aid bill
# T. M. J. Ahokovi
# Date Created: 2024-06-12
# Last Updated: 2024-08-20 \\ tja

rm(list = ls())

# Set up directories

# uncomment the line below and add working directory path between the quotation marks
# master_directory = "ADD WORKING DIRECTORY HERE/"

data_folder = paste0(master_directory,"Data/")
do_folder = paste0(master_directory,"Do/")
results_folder = paste0(master_directory,"Results/")
charts_folder = paste0(master_directory,"Charts/")

# Load libraries
library(tidyverse)
library(janitor)
library(readxl)
library(cdlTools)
library(fuzzyjoin)
library(haven)
library(rvest)
library(Quandl)
library(lubridate)
library(openxlsx)
library(lmtest)
library(sandwich)
library(AER)
library(tidystringdist)
library(stargazer)
library(rio)
library(ropensecretsapi)
library(pdftools)

# Read the media coverage data from an Excel file located in the data_folder directory
media_coverage_states <- read_xlsx(paste0(data_folder, "Raw/media coverage by state.xlsx")) %>%
  # Rename the columns for easier reference in the analysis
  rename(
    year = year,                         
    state = state,                       
    office = office,                     
    incumbent = Incumbent,               
    inc_search = `Incumbent Search Term`, # Search term used for finding incumbent-related media
    CARES = `CARES Act`,                 
    CARES_inc_state = `CARES and Incumbent State`,
    CARES_inc = `CARES and Incumbent`,   
    ARPA = `American Resuce Plan`,       
    ARPA_inc_state = `American Rescue and Incumbent State`,
    ARPA_inc = `American Rescue and Incumbent`
  )

# Read a Stata (.dta) file containing state election panel data
df_main1 <- read_dta(paste0(data_folder, "Processing/state_elections_panel_df5_20240731.dta"))

# Select relevant columns from the dataset for further analysis
# The select() function is used to keep only the necessary columns for this analysis
df_main2 <- df_main1 %>%
  select(
    year,
    gub_year,  # Ensure gub_year is included for later merging
    state,                       
    office,                      
    senate_class,                # The class of the senate seat (important for staggered elections)
    norm_vote_top2_share_sd,     # Normal vote control variable
    total_1k_muni_aid_per_resident,  # Total aid per resident (in thousands of dollars)
    reps_per_million,             # Number of representatives per million residents (used as an instrumental variable)
    pop
  )

# Merge the selected election data with the media coverage data
# left_join() combines the two datasets based on shared columns (year, state, office, and senate_class)
df_main3 <- df_main2 %>%
  left_join(media_coverage_states, by = c("year", "state", "office", "senate_class"))

# Filter out gubernatorial races, ensure gub_year is set, and prepare for merging
# Here we focus on gubernatorial races and prepare this data for comparison with senatorial races
df_gub <- df_main3 %>%
  filter(office == "gub") %>%
  # Ensure that gub_year is set to year for gubernatorial races
  mutate(gub_year = ifelse(office == "gub", year, gub_year)) %>%
  select(state, gub_year, office, CARES, CARES_inc_state, CARES_inc, ARPA, ARPA_inc_state, ARPA_inc) %>%
  # Reassign office to "senate" for the purpose of merging with the senatorial data
  mutate(office = "senate")  

# Merge the modified gubernatorial data back with the original dataset containing senatorial races
# The suffix argument adds "_sen" or "_gub" to distinguish between columns with the same names
df_main4 <- df_main3 %>%
  filter(office == "senate") %>%
  left_join(df_gub, by = c("state", "gub_year", "office"), suffix = c("_sen", "_gub")) %>%
  # Compute various ratios and totals to compare gubernatorial and senatorial races
  mutate(
    total_CARES_ARPA_inc_gub = CARES_inc_gub + ARPA_inc_gub,  
    total_CARES_ARPA_inc_sen = CARES_inc_sen + ARPA_inc_sen,
    
    # Create a new variable that sums the total stories from gubernatorial and senatorial races
    total_stories = (total_CARES_ARPA_inc_gub + total_CARES_ARPA_inc_sen)/pop,
    
    # Calculate ratios comparing gubernatorial to senatorial media coverage
    ratio_total_CARES_ARPA_inc = total_CARES_ARPA_inc_gub / total_CARES_ARPA_inc_sen, 
    ratio_total_CARES_inc = CARES_inc_gub / CARES_inc_sen,  
    ratio_total_ARPA_inc = ARPA_inc_gub / ARPA_inc_sen,  
    
    total_CARES_ARPA_inc_state_gub = CARES_inc_state_gub + ARPA_inc_state_gub,  
    total_CARES_ARPA_inc_state_sen = CARES_inc_state_sen + ARPA_inc_state_sen,  
    
    ratio_total_CARES_ARPA_inc_state = total_CARES_ARPA_inc_state_gub / total_CARES_ARPA_inc_state_sen,
    ratio_total_CARES_inc_state = CARES_inc_state_gub / CARES_inc_state_sen,  
    ratio_total_ARPA_inc_state = ARPA_inc_state_gub / ARPA_inc_state_sen,  
    
    total_CARES_ARPA_gub = CARES_gub + ARPA_gub,  
    total_CARES_ARPA_sen = CARES_sen + ARPA_sen,  
    
    ratio_total_CARES_ARPA = total_CARES_ARPA_gub / total_CARES_ARPA_sen,
    ratio_total_CARES = CARES_gub / CARES_sen,  
    ratio_total_ARPA = ARPA_gub / ARPA_sen  
  )

# Add senate-specific and senate class non-specific lagged variables for each ratio
# Lagged variables capture the value of a variable from the previous time period
df_main5 <- df_main4 %>%
  filter(year <= 2022 & year >= 2020) %>%
  arrange(state, office, year) %>%  # Arrange the data by state, office, and year
  group_by(state, office) %>%  # Group the data by state and office for calculating lagged variables
  mutate(
    lagged_ratio_total_CARES_ARPA_inc = lag(ratio_total_CARES_ARPA_inc),  
    lagged_ratio_total_CARES_inc = lag(ratio_total_CARES_inc),  
    lagged_ratio_total_ARPA_inc = lag(ratio_total_ARPA_inc),  
    lagged_ratio_total_CARES_ARPA_inc_state = lag(ratio_total_CARES_ARPA_inc_state),
    lagged_ratio_total_CARES_inc_state = lag(ratio_total_CARES_inc_state),  
    lagged_ratio_total_ARPA_inc_state = lag(ratio_total_ARPA_inc_state),
    lagged_ratio_total_CARES_ARPA = lag(ratio_total_CARES_ARPA),  
    lagged_ratio_total_CARES = lag(ratio_total_CARES),  
    lagged_ratio_total_ARPA = lag(ratio_total_ARPA)  
  ) %>%
  ungroup() %>%  # Remove grouping to avoid affecting subsequent operations
  arrange(state, office, senate_class, year) %>%  # Arrange data by state, office, senate class, and year
  group_by(state, office, senate_class) %>%  # Group by state, office, and senate class for further lagged variables
  mutate(
    lagged_ratio_total_CARES_ARPA_inc_class_spec = lag(ratio_total_CARES_ARPA_inc),
    lagged_ratio_total_CARES_inc_class_spec = lag(ratio_total_CARES_inc),
    lagged_ratio_total_ARPA_inc_class_spec = lag(ratio_total_ARPA_inc),
    lagged_ratio_total_CARES_ARPA_inc_state_class_spec = lag(ratio_total_CARES_ARPA_inc_state),
    lagged_ratio_total_CARES_inc_state_class_spec = lag(ratio_total_CARES_inc_state),
    lagged_ratio_total_ARPA_inc_state_class_spec = lag(ratio_total_ARPA_inc_state),
    lagged_ratio_total_CARES_ARPA_class_spec = lag(ratio_total_CARES_ARPA), 
    lagged_ratio_total_CARES_class_spec = lag(ratio_total_CARES),  
    lagged_ratio_total_ARPA_class_spec = lag(ratio_total_ARPA)  
  ) %>%
  ungroup()  # Remove grouping to finalize the data

# 2SLS ----------------------------------------------------------------------- #

# Define a function to run the IV (Instrumental Variable) model with different configurations
# The function takes two arguments: the outcome variable and whether to include a control variable.
run_iv_model <- function(outcome_var, include_control = TRUE) {
  
  # Create the formula for the IV model based on whether the control variable is included.
  # If 'include_control' is TRUE, the control variable 'norm_vote_top2_share_sd' is included in the formula.
  if (include_control) {
    formula <- as.formula(paste0(outcome_var, " ~ total_1k_muni_aid_per_resident + norm_vote_top2_share_sd | 
                                  reps_per_million + norm_vote_top2_share_sd"))
  } else {
    # If 'include_control' is FALSE, the control variable is omitted from the formula.
    formula <- as.formula(paste0(outcome_var, " ~ total_1k_muni_aid_per_resident | 
                                  reps_per_million"))
  }
  
  # Run the IV model using the 'ivreg' function from the AER package.
  # The formula and data are passed to the function.
  iv_model <- ivreg(formula, data = df_main5)
  
  # Calculate clustered standard errors at the state level.
  clustered_se <- coeftest(iv_model, vcov = vcovCL(iv_model, cluster = ~ state))
  
  # Return the IV model object.
  return(iv_model)
}

# Define the outcome variables that will be tested in the IV models.
# These are the ratios calculated earlier: overall CARES + ARPA, CARES, and ARPA.
outcome_vars <- c("ratio_total_CARES_ARPA_inc", "ratio_total_CARES_inc", "ratio_total_ARPA_inc")

# Initialize an empty list to store the results of the IV models.
models <- list()

# Loop through each outcome variable and run the IV model twice:
# Once with the control variable and once without.
for (outcome in outcome_vars) {
  cat("\nRunning IV model for outcome:", outcome, "with control:\n")
  # Run the IV model with the control variable included and store the result in the models list.
  models[[paste0(outcome, "_with_control")]] <- run_iv_model(outcome_var = outcome, include_control = TRUE)
  
  cat("\nRunning IV model for outcome:", outcome, "without control:\n")
  # Run the IV model without the control variable and store the result in the models list.
  models[[paste0(outcome, "_without_control")]] <- run_iv_model(outcome_var = outcome, include_control = FALSE)
}

# Use stargazer to summarize the models in a table format.
stargazer(models,
          type = "text",  # Output as plain text (can be changed to 'html' or 'latex' for different formats)
          omit = "Constant",  # Omit the constant (intercept) term from the table
          covariate.labels = c("Total Aid per Resident (USD thousands)", "Normal Vote", "Reps per Million"),
          dep.var.labels = c("CARES + ARPA", "CARES", "ARPA"),  # Labels for the dependent variables
          model.names = FALSE,  # Do not include model names in the table
          no.space = F,  # Remove extra spaces in the table output
          header = FALSE,  # Do not include a header in the table output
          omit.stat = c("adj.rsq", "ser")) # Omit adjusted R-squared and standard error statistics
          # out = "ADD FILE PATH HERE")

stargazer(models,
          type = "html",  # Output as HTML (can be changed to 'text' or 'latex' for different formats)
          omit = "Constant",  # Omit the constant (intercept) term from the table
          covariate.labels = c("Total Aid per Resident (USD thousands)", "Normal Vote", "Reps per Million"),
          dep.var.labels = c("CARES + ARPA", "CARES", "ARPA"),  # Labels for the dependent variables
          model.names = FALSE,  # Do not include model names in the table
          no.space = F,  # Remove extra spaces in the table output
          header = FALSE,  # Do not include a header in the table output
          omit.stat = c("adj.rsq", "ser"))  # Omit adjusted R-squared and standard error statistics
          # out = "ADD FILE PATH HERE")

stargazer(models,
          type = "latex",  # Output as Latex (can be changed to 'html' or 'text' for different formats)
          omit = "Constant",  # Omit the constant (intercept) term from the table
          covariate.labels = c("Total Aid per Resident (USD thousands)", "Normal Vote", "Reps per Million"),
          dep.var.labels = c("CARES + ARPA", "CARES", "ARPA"),  # Labels for the dependent variables
          model.names = FALSE,  # Do not include model names in the table
          no.space = F,  # Remove extra spaces in the table output
          header = FALSE,  # Do not include a header in the table output
          omit.stat = c("adj.rsq", "ser")) # Omit adjusted R-squared and standard error statistics

# Total coverage ------------------------------------------------------------- #

# Aggregating data to get total counts of COVID aid coverage
df_total_coverage <- df_main3 %>%
  filter(year >= 2020 & year <= 2022) %>%
  mutate(
    total_CARES_ARPA_inc = CARES_inc + ARPA_inc,
    total_CARES_ARPA_inc_state = CARES_inc_state + ARPA_inc_state,
    total_CARES_ARPA = CARES + ARPA
  ) %>%
  select(state, year, total_CARES_ARPA_inc, total_CARES_ARPA_inc_state, total_CARES_ARPA, 
         norm_vote_top2_share_sd, reps_per_million, total_1k_muni_aid_per_resident) %>%  # Select the needed variables
  group_by(state, year) %>%
  summarize(
    total_CARES_ARPA_inc = sum(total_CARES_ARPA_inc, na.rm = TRUE),
    total_CARES_ARPA_inc_state = sum(total_CARES_ARPA_inc_state, na.rm = TRUE),
    total_CARES_ARPA = sum(total_CARES_ARPA, na.rm = TRUE),
    total_aid_per_capita = mean(total_1k_muni_aid_per_resident, na.rm = TRUE) * 1000,  # Convert to actual dollars
    norm_vote_top2_share_sd = first(norm_vote_top2_share_sd),  # Assuming it doesn't vary within state and year
    reps_per_million = first(reps_per_million)  # Assuming it doesn't vary within state and year
  ) %>%
  ungroup()  # Ungroup to avoid unintended group-wise operations later


# 2SLS regression using ivreg() to predict total media coverage without the state population control
model_total_coverage_2sls <- ivreg(
  total_CARES_ARPA_inc ~ total_aid_per_capita + norm_vote_top2_share_sd | reps_per_million + norm_vote_top2_share_sd,
  data = df_total_coverage
)

summary(model_total_coverage_2sls)

# Run the same analysis for the other coverage types
model_total_coverage_state_2sls <- ivreg(
  total_CARES_ARPA_inc_state ~ total_aid_per_capita + norm_vote_top2_share_sd | reps_per_million + norm_vote_top2_share_sd,
  data = df_total_coverage
)

model_total_CARES_ARPA_2sls <- ivreg(
  total_CARES_ARPA ~ total_aid_per_capita + norm_vote_top2_share_sd | reps_per_million + norm_vote_top2_share_sd,
  data = df_total_coverage
)

# Use stargazer to summarize the 2SLS models in a table format
# stargazer(
#   model_total_coverage_2sls, model_total_coverage_state_2sls, model_total_CARES_ARPA_2sls,
#   type = "text", 
#   omit = "Constant",
#   covariate.labels = c("Total Aid per Capita (USD)"),
#   dep.var.labels = c("Incumb.-Specific", "Incumb. + State-Specific", "State"),
#   model.names = FALSE,
#   no.space = F,
#   header = FALSE,
#   omit.stat = c("adj.rsq", "ser"),
#   out = "ADD FILE PATH HERE"
# )
